The datasets are quite similar in general
Autism samples have higher levels of expression (again! This is weird)
Overexpressed genes have a higher level of expression than under expressed genes (again!)
DE genes separate into two clouds based on the sign of their lfc
This dataset has five times less DE genes than Gandal, with only 4% of its genes being DE
There don’t seem to be underlying distributions when plotting genes by mean expression or by standard deviation anymore
library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(plotlyutils)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally)
library(biomaRt) ; library(DESeq2)
library(Rtsne)
library(ClusterR)
library(knitr)
Load preprocessed dataset (preprocessing code in 20_03_30_data_preprocessing.Rmd)
# Gandal dataset
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame
# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal'=1)
# Update DE_info with Neuronal information
DE_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(GO_neuronal, by='ID') %>%
mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
mutate(significant=padj<0.05 & !is.na(padj))
rm(GO_annotations)
plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr))
p1 = ggplotly(plot_data %>% ggplot(aes(Mean)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
scale_x_log10() + theme_minimal())
plot_data = data.frame('ID'=colnames(datExpr), 'Mean'=colMeans(datExpr))
p2 = ggplotly(plot_data %>% ggplot(aes(Mean)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
theme_minimal() + ggtitle('Mean expression density by Gene (left) and by Sample (right)'))
subplot(p1, p2, nrows=1)
rm(p1, p2, plot_data)
The genes with Neuronal function have a higher expression distribution than the ones without it
The ASD group has higher levels of expression than the Control group
plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr)) %>%
left_join(GO_neuronal, by='ID') %>% mutate('Neuronal'=ifelse(is.na(Neuronal),F,T))
p1 = plot_data %>% ggplot(aes(Mean, color=Neuronal, fill=Neuronal)) + geom_density(alpha=0.3) +
scale_x_log10() + theme_minimal() + theme(legend.position='bottom') +
ggtitle('Mean expression density by gene')
plot_data = data.frame('ID'=colnames(datExpr), 'Mean'=colMeans(datExpr)) %>%
left_join(datMeta, by=c('ID'='ID'))
p2 = plot_data %>% ggplot(aes(Mean, color=Diagnosis, fill=Diagnosis)) + geom_density(alpha=0.3) +
theme_minimal() + theme(legend.position='bottom') +
ggtitle('Mean expression density by Sample')
grid.arrange(p1, p2, nrow=1)
rm(GO_annotations, plot_data, p1, p2)
In general there doesn’t seem to be a lot of variance in mean expression between autism and control samples by gene
plot_data = data.frame('ID'=rownames(datExpr),
'ASD'=rowMeans(datExpr[,datMeta$Diagnosis=='ASD']),
'CTL'=rowMeans(datExpr[,datMeta$Diagnosis!='ASD'])) %>%
mutate(diff=ASD-CTL, abs_diff = abs(ASD-CTL)) %>%
mutate(std_diff = (diff-mean(diff))/sd(diff), distance = abs((diff-mean(diff))/sd(diff)))
plot_data %>% ggplot(aes(ASD, CTL, color = distance)) + geom_point(alpha = plot_data$abs_diff) + geom_abline(color = 'gray') +
scale_color_viridis(direction = -1) + ggtitle('Mean expression ASD vs CTL') + theme_minimal() + coord_fixed()
summary(plot_data$std_diff)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.39123 -0.63561 -0.01915 0.00000 0.60240 11.13999
cat(paste0('There are ', sum(plot_data$distance>3), ' genes with a difference between Diagnoses larger than 3 SD to ',
'the distance distribution of all genes'))
## There are 110 genes with a difference between Diagnoses larger than 3 SD to the distance distribution of all genes
#cat(paste0('Outlier genes: ', paste(plot_data$ID[abs(plot_data$std_diff)>3], collapse = ', ')))
There doesn’t seem to be a noticeable difference between mean expression by gene between Diagnosis groups
Samples with autism tend to have a narrower higher level of expression than the control group (as we had already seen above)
plot_data = rbind(data.frame('Mean'=rowMeans(datExpr[,datMeta$Diagnosis=='ASD']), 'Diagnosis'='ASD'),
data.frame('Mean'=rowMeans(datExpr[,datMeta$Diagnosis!='ASD']), 'Diagnosis'='CTL')) %>%
mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
p1 = ggplotly(plot_data %>% ggplot(aes(Mean, color=Diagnosis, fill=Diagnosis)) +
geom_density(alpha=0.3) + scale_x_log10() + theme_minimal())
plot_data = rbind(data.frame('Mean'=colMeans(datExpr[,datMeta$Diagnosis=='ASD']), 'Diagnosis'='ASD'),
data.frame('Mean'=colMeans(datExpr[,datMeta$Diagnosis!='ASD']), 'Diagnosis'='CTL')) %>%
mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
p2 = ggplotly(plot_data %>% ggplot(aes(Mean, color=Diagnosis, fill=Diagnosis)) +
geom_density(alpha=0.3) + theme_minimal() +
ggtitle('Mean expression by Gene (left) and by Sample (right) grouped by Diagnosis'))
subplot(p1, p2, nrows=1)
rm(p1, p2, plot_data)
The first principal component seems to separate relatively well the two diagnosis
pca = datExpr %>% t %>% prcomp
plot_data = data.frame('ID'=colnames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
left_join(datMeta, by=c('ID'='ID')) %>%
dplyr::select('ID','PC1','PC2','Diagnosis') %>%
mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
plot_data %>% ggplot(aes(PC1, PC2, color=Diagnosis)) + geom_point() + theme_minimal() + ggtitle('PCA') +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)'))
rm(pca, plot_data)
Looks exactly the same as the PCA visualisation, just inverting the x axis
d = datExpr %>% t %>% dist
fit = cmdscale(d, k=2)
plot_data = data.frame('ID'=colnames(datExpr), 'C1'=fit[,1], 'C2'=fit[,2]) %>%
left_join(datMeta, by=c('ID'='ID')) %>%
dplyr::select('C1','C2','Diagnosis') %>%
mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
plot_data %>% ggplot(aes(C1, C2, color=Diagnosis)) + geom_point() + theme_minimal() + ggtitle('MDS')
rm(d, fit, plot_data)
Higher perplexities seem to capture the difference between diagnosis better, although at the end they seem to be capturing another pattern as well, since the samples seem to be grouped in pairs
perplexities = c(2,5,10,30)
ps = list()
for(i in 1:length(perplexities)){
set.seed(123)
tsne = datExpr %>% t %>% Rtsne(perplexity=perplexities[i])
plot_data = data.frame('ID'=colnames(datExpr), 'C1'=tsne$Y[,1], 'C2'=tsne$Y[,2]) %>%
left_join(datMeta, by=c('ID'='ID')) %>%
dplyr::select('C1','C2','Diagnosis','Subject_ID') %>%
mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
ps[[i]] = plot_data %>% ggplot(aes(C1, C2, color=Diagnosis)) + geom_point() + theme_minimal() +
ggtitle(paste0('Perplexity=',perplexities[i])) + theme(legend.position='none')
}
grid.arrange(grobs=ps, nrow=2)
rm(ps, perplexities, tsne, i)
In the Gandal dataset, the higher perplexity values managed to capture the subject the samples belonged to, but it doesn’t seem to do it with this new dataset
ggplotly(plot_data %>% ggplot(aes(C1, C2, color=Subject_ID)) + geom_point(aes(id=Subject_ID)) + theme_minimal() +
theme(legend.position='none') + ggtitle('t-SNE Perplexity=30 coloured by Subject ID'))
rm(plot_data)
First Principal Component explains over 96% of the total variance (Less than Gandal’s)
There’s a really strong correlation between the mean expression of a gene and the 1st principal component
There is one big outlier in the 2nd principal component
pca = datExpr %>% prcomp
plot_data = data.frame( 'PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'MeanExpr'=rowMeans(datExpr))
plot_data %>% ggplot(aes(PC1, PC2, color=MeanExpr)) + geom_point(alpha=0.3) + theme_minimal() +
scale_color_viridis() + ggtitle('PCA') +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)'))
rm(pca, plot_data)
Higher perplexities capture a cleaner visualisation of the data ordered by mean expression, in a similar (although not as linear) way to PCA
perplexities = c(1,2,5,10,50,100)
ps = list()
for(i in 1:length(perplexities)){
tsne = read.csv(paste0('./../Visualisations/tsne_perplexity_',perplexities[i],'.csv'))
plot_data = data.frame('C1'=tsne[,1], 'C2'=tsne[,2], 'MeanExpr'=rowMeans(datExpr))
ps[[i]] = plot_data %>% ggplot(aes(C1, C2, color=MeanExpr)) + geom_point(alpha=0.5) + theme_minimal() +
scale_color_viridis() + ggtitle(paste0('Perplexity=',perplexities[i])) +
theme(legend.position='none') + coord_fixed()
}
grid.arrange(grobs=ps, nrow=2)
rm(perplexities, ps, tsne, i)
Only 944 genes (~7% vs ~26% in Gandal’s dataset) are significant using a threshold of 0.05 for the adjusted p-value and a without a log Fold Change threshold (keeping the null hypothesis \(H_0: lfc=0\))
Many genes weren’t assigned an adjusted p-value
table(DE_info$padj<0.05, useNA='ifany')
##
## FALSE TRUE <NA>
## 10629 944 2125
p = DE_info %>% ggplot(aes(log2FoldChange, padj, color=significant)) + geom_point(alpha=0.2) +
scale_y_sqrt() + xlab('log2 Fold Change') + ylab('Adjusted p-value') + theme_minimal()
ggExtra::ggMarginal(p, type = 'density', color='gray', fill='gray', size=10)
rm(p)
There is a clear negative relation between lfc and mean expression, for both differentially expressed and not differentially expressed groups of genes
The relation is strongest for genes with low levels of expression
plot_data = data.frame('ID'=rownames(datExpr), 'meanExpr'=rowMeans(datExpr)) %>% left_join(DE_info, by='ID') %>%
mutate('statisticallySignificant' = ifelse(is.na(padj),NA, ifelse(padj<0.05, TRUE, FALSE)),
'alpha' = ifelse(padj>0.05 | is.na(padj), 0.1, 0.5))
plot_data %>% ggplot(aes(meanExpr, abs(log2FoldChange), color=statisticallySignificant)) +
geom_point(alpha = plot_data$alpha) + geom_smooth(method='lm') +
theme_minimal() + scale_y_sqrt() + theme(legend.position = 'bottom') +
xlab('Mean Expression') + ylab('abs(lfc)') + ggtitle('Log fold change by level of expression')
When filtering for differential expression, the points separate into two clouds depending on whether they are over or underexpressed
The top cloud corresponds to the under expressed genes and the bottom to the over expressed ones
datExpr_DE = datExpr[DE_info$significant,]
pca = datExpr_DE %>% prcomp
plot_data = cbind(data.frame('PC1'=pca$x[,1], 'PC2'=pca$x[,2]), DE_info[DE_info$significant==TRUE,])
pos_zero = -min(plot_data$log2FoldChange)/(max(plot_data$log2FoldChange)-min(plot_data$log2FoldChange))
p = plot_data %>% ggplot(aes(PC1, PC2, color=log2FoldChange)) + geom_point(alpha=0.5) +
scale_color_gradientn(colours=c('#F8766D','#faa49e','white','#00BFC4','#009499'),
values=c(0, pos_zero-0.1, pos_zero, pos_zero+0.1, 1)) +
theme_minimal() + ggtitle('
PCA of differentially expressed genes') + # This is on purpose, PDF doesn't save well without this white space (?)
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
theme(legend.position = 'bottom')
ggExtra::ggMarginal(p, type='density', color='gray', fill='gray', size=10)
rm(pos_zero, p)
Separating the genes into these two groups: Salmon: under-expressed, aqua: over-expressed
plot_data = plot_data %>% mutate('Group'=ifelse(log2FoldChange>0,'overexpressed','underexpressed')) %>%
mutate('Group' = factor(Group, levels=c('underexpressed','overexpressed')))
plot_data %>% ggplot(aes(PC1, PC2, color=Group)) + geom_point(alpha=0.4) +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
theme_minimal() + ggtitle('PCA')
rm(pca)
List of top DE genes
# Get genes names
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl', host='feb2014.archive.ensembl.org')
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=plot_data$ID, mart=mart) %>%
rename(external_gene_id = 'gene_name', ensembl_gene_id = 'ID')
top_genes = plot_data %>% left_join(gene_names, by='ID') %>% arrange(-abs(log2FoldChange)) %>%
top_n(50, wt=log2FoldChange)
kable(top_genes %>% dplyr::select(ID, gene_name, log2FoldChange, padj, Neuronal, Group))
| ID | gene_name | log2FoldChange | padj | Neuronal | Group |
|---|---|---|---|---|---|
| ENSG00000256618 | MTRNR2L1 | 1.9916981 | 0.0401708 | 0 | overexpressed |
| ENSG00000137959 | IFI44L | 1.8535885 | 0.0000893 | 0 | overexpressed |
| ENSG00000126709 | IFI6 | 1.5571198 | 0.0000035 | 0 | overexpressed |
| ENSG00000224982 | TMEM233 | 1.3304682 | 0.0048144 | 0 | overexpressed |
| ENSG00000186470 | BTN3A2 | 1.1915207 | 0.0003132 | 0 | overexpressed |
| ENSG00000150337 | FCGR1A | 1.1902867 | 0.0359492 | 0 | overexpressed |
| ENSG00000196126 | HLA-DRB1 | 1.1625629 | 0.0068988 | 0 | overexpressed |
| ENSG00000090920 | FCGBP | 1.1389915 | 0.0092884 | 0 | overexpressed |
| ENSG00000188536 | HBA2 | 1.0685930 | 0.0354139 | 0 | overexpressed |
| ENSG00000206172 | HBA1 | 1.0550535 | 0.0048144 | 0 | overexpressed |
| ENSG00000187608 | ISG15 | 1.0270437 | 0.0057662 | 0 | overexpressed |
| ENSG00000167754 | KLK5 | 1.0203547 | 0.0250979 | 0 | overexpressed |
| ENSG00000185955 | C7orf61 | 0.9949872 | 0.0339392 | 0 | overexpressed |
| ENSG00000130303 | BST2 | 0.9830483 | 0.0003724 | 0 | overexpressed |
| ENSG00000133048 | CHI3L1 | 0.9654080 | 0.0259421 | 0 | overexpressed |
| ENSG00000119917 | IFIT3 | 0.9623987 | 0.0033546 | 0 | overexpressed |
| ENSG00000133321 | RARRES3 | 0.9537325 | 0.0001111 | 0 | overexpressed |
| ENSG00000107201 | DDX58 | 0.9531294 | 0.0014643 | 0 | overexpressed |
| ENSG00000181019 | NQO1 | 0.9390997 | 0.0036609 | 1 | overexpressed |
| ENSG00000197747 | S100A10 | 0.9342678 | 0.0021598 | 0 | overexpressed |
| ENSG00000033867 | SLC4A7 | 0.9330656 | 0.0000464 | 0 | overexpressed |
| ENSG00000159403 | C1R | 0.9329289 | 0.0000893 | 0 | overexpressed |
| ENSG00000122641 | INHBA | 0.9115749 | 0.0436385 | 0 | overexpressed |
| ENSG00000132274 | TRIM22 | 0.9085575 | 0.0205875 | 0 | overexpressed |
| ENSG00000140961 | OSGIN1 | 0.9041925 | 0.0194605 | 0 | overexpressed |
| ENSG00000105559 | PLEKHA4 | 0.9027897 | 0.0012494 | 0 | overexpressed |
| ENSG00000084093 | REST | 0.8858131 | 0.0037218 | 1 | overexpressed |
| ENSG00000181722 | ZBTB20 | 0.8772604 | 0.0011067 | 0 | overexpressed |
| ENSG00000182326 | C1S | 0.8729420 | 0.0022401 | 0 | overexpressed |
| ENSG00000028277 | POU2F2 | 0.8627281 | 0.0018340 | 0 | overexpressed |
| ENSG00000054179 | ENTPD2 | 0.8498669 | 0.0097728 | 0 | overexpressed |
| ENSG00000152952 | PLOD2 | 0.8421405 | 0.0332803 | 0 | overexpressed |
| ENSG00000025708 | TYMP | 0.8388151 | 0.0179278 | 0 | overexpressed |
| ENSG00000186818 | LILRB4 | 0.8384667 | 0.0273850 | 0 | overexpressed |
| ENSG00000163840 | DTX3L | 0.8378994 | 0.0208838 | 0 | overexpressed |
| ENSG00000102265 | TIMP1 | 0.8374703 | 0.0016570 | 0 | overexpressed |
| ENSG00000077264 | PAK3 | 0.8177574 | 0.0003130 | 0 | overexpressed |
| ENSG00000185885 | IFITM1 | 0.8128928 | 0.0133938 | 0 | overexpressed |
| ENSG00000068079 | IFI35 | 0.8037349 | 0.0025741 | 0 | overexpressed |
| ENSG00000067082 | KLF6 | 0.7803468 | 0.0000603 | 0 | overexpressed |
| ENSG00000070778 | PTPN21 | 0.7677702 | 0.0110275 | 0 | overexpressed |
| ENSG00000155363 | MOV10 | 0.7673794 | 0.0068575 | 0 | overexpressed |
| ENSG00000157150 | TIMP4 | 0.7649290 | 0.0183559 | 0 | overexpressed |
| ENSG00000173369 | C1QB | 0.7570775 | 0.0318371 | 0 | overexpressed |
| ENSG00000147113 | CXorf36 | 0.7558034 | 0.0173219 | 0 | overexpressed |
| ENSG00000115084 | SLC35F5 | 0.7553919 | 0.0200686 | 0 | overexpressed |
| ENSG00000102805 | CLN5 | 0.7515606 | 0.0040051 | 1 | overexpressed |
| ENSG00000167414 | GNG8 | 0.7481608 | 0.0460567 | 0 | overexpressed |
| ENSG00000002933 | TMEM176A | 0.7431177 | 0.0177139 | 0 | overexpressed |
| ENSG00000117620 | SLC35A3 | 0.7401913 | 0.0153041 | 0 | overexpressed |
rm(top_genes)
Plotting the mean expression by group in Gandal’s dataset there seemed to exist underlying distributions, so we would use GMM to separate them, but everything seems very homogeneous here, so this doesn’t seem to be necessary. (If we do it anyway we can see that they still cluster by mean expression, which makes sense since it explains the majority of the variance of the genes)
Under-expressed ASD genes tend to have a higher mean expression than over-expressed ones (same as with Gandal’s data)
gg_colour_hue = function(n) {
hues = seq(15, 375, length = n + 1)
hcl(h = hues, l = 65, c = 100)[1:n]
}
tot_n_clusters = 4
plot_data = plot_data %>% mutate('MeanExpr'=rowMeans(datExpr_DE), 'SDExpr'=apply(datExpr_DE,1,sd))
GMM_G1 = plot_data %>% filter(Group=='overexpressed') %>% dplyr::select(MeanExpr) %>% GMM(2)
GMM_G2 = plot_data %>% filter(Group=='underexpressed') %>% dplyr::select(MeanExpr) %>% GMM(2)
memberships_G1 = data.frame('ID'=plot_data$ID[plot_data$Group=='overexpressed'],
'Membership'=GMM_G1$Log_likelihood %>%
apply(1, function(x) glue('over_', which.max(x))))
memberships_G2 = data.frame('ID'=plot_data$ID[plot_data$Group=='underexpressed'],
'Membership'=GMM_G2$Log_likelihood %>%
apply(1, function(x) glue('under_', which.max(x))))
plot_data = rbind(memberships_G1, memberships_G2) %>% left_join(plot_data, by='ID')
p1 = plot_data %>% ggplot(aes(x=MeanExpr, color=Group, fill=Group)) + geom_density(alpha=0.4) +
theme_minimal() + theme(legend.position='bottom')
p2 = plot_data %>% ggplot(aes(x=MeanExpr)) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(tot_n_clusters)[1],
args=list(mean=GMM_G1$centroids[1], sd=GMM_G1$covariance_matrices[1])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(tot_n_clusters)[2],
args=list(mean=GMM_G1$centroids[2], sd=GMM_G1$covariance_matrices[2])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(tot_n_clusters)[3],
args=list(mean=GMM_G2$centroids[1], sd=GMM_G2$covariance_matrices[1])) +
stat_function(fun=dnorm, n=100, colour=gg_colour_hue(tot_n_clusters)[4],
args=list(mean=GMM_G2$centroids[2], sd=GMM_G2$covariance_matrices[2])) +
theme_minimal()
p3 = plot_data %>% ggplot(aes(PC1, PC2, color=Membership)) + geom_point(alpha=0.4) + theme_minimal() +
theme(legend.position='bottom')
grid.arrange(p1, p2, p3, nrow=1)
rm(gg_colour_hue, GMM_G1, GMM_G2, memberships_G1, memberships_G2, p1, p2, p3, tot_n_clusters)
For previous preprocessing pipelines, the pattern found above was also present in the standard deviation, but there doesn’t seem to be any distinguishable patterns now. This could be because the variance was almost homogenised with the vst normalisation algorithm.
plot_data %>% ggplot(aes(x=SDExpr, color=Group, fill=Group)) + geom_density(alpha=0.4) + theme_minimal()
rm(plot_data)
fc_list = seq(1, 1.1, 0.003)
n_genes = nrow(datExpr)
# Calculate PCAs
datExpr_pca_samps = datExpr %>% data.frame %>% t %>% prcomp(scale.=TRUE)
datExpr_pca_genes = datExpr %>% data.frame %>% prcomp(scale.=TRUE)
# Initialise DF to save PCA outputs
pcas_samps = datExpr_pca_samps$x %>% data.frame %>% dplyr::select(PC1:PC2) %>%
mutate('ID'=colnames(datExpr), 'fc'=0, PC1=scale(PC1), PC2=scale(PC2))
pcas_genes = datExpr_pca_genes$x %>% data.frame %>% dplyr::select(PC1:PC2) %>%
mutate('ID'=rownames(datExpr), 'fc'=0, PC1=scale(PC1), PC2=scale(PC2))
pca_samps_old = pcas_samps
pca_genes_old = pcas_genes
for(fc in fc_list){
# Recalculate DE_info with the new threshold (p-values change) an filter DE genes
DE_genes = results(dds, lfcThreshold=log2(fc), altHypothesis='greaterAbs') %>% data.frame %>%
mutate('ID'=rownames(.)) %>% filter(padj<0.05)
datExpr_DE = datExpr %>% data.frame %>% filter(rownames(.) %in% DE_genes$ID)
n_genes = c(n_genes, nrow(DE_genes))
# Calculate PCAs
datExpr_pca_samps = datExpr_DE %>% t %>% prcomp(scale.=TRUE)
datExpr_pca_genes = datExpr_DE %>% prcomp(scale.=TRUE)
# Create new DF entries
pca_samps_new = datExpr_pca_samps$x %>% data.frame %>% dplyr::select(PC1:PC2) %>%
mutate('ID'=colnames(datExpr), 'fc'=fc, PC1=scale(PC1), PC2=scale(PC2))
pca_genes_new = datExpr_pca_genes$x %>% data.frame %>% dplyr::select(PC1:PC2) %>%
mutate('ID'=DE_genes$ID, 'fc'=fc, PC1=scale(PC1), PC2=scale(PC2))
# Change PC sign if necessary
if(cor(pca_samps_new$PC1, pca_samps_old$PC1)<0) pca_samps_new$PC1 = -pca_samps_new$PC1
if(cor(pca_samps_new$PC2, pca_samps_old$PC2)<0) pca_samps_new$PC2 = -pca_samps_new$PC2
if(cor(pca_genes_new[pca_genes_new$ID %in% pca_genes_old$ID,]$PC1,
pca_genes_old[pca_genes_old$ID %in% pca_genes_new$ID,]$PC1)<0){
pca_genes_new$PC1 = -pca_genes_new$PC1
}
if(cor(pca_genes_new[pca_genes_new$ID %in% pca_genes_old$ID,]$PC2,
pca_genes_old[pca_genes_old$ID %in% pca_genes_new$ID,]$PC2 )<0){
pca_genes_new$PC2 = -pca_genes_new$PC2
}
pca_samps_old = pca_samps_new
pca_genes_old = pca_genes_new
# Update DFs
pcas_samps = rbind(pcas_samps, pca_samps_new)
pcas_genes = rbind(pcas_genes, pca_genes_new)
}
# Add Diagnosis/SFARI score information
pcas_samps = pcas_samps %>% left_join(datMeta, by=c('ID')) %>%
dplyr::select(ID, PC1, PC2, fc, Diagnosis, brain_lobe)
# pcas_genes = pcas_genes %>% left_join(SFARI_genes, by='ID') %>%
# mutate('score'=as.factor(`gene-score`)) %>%
# dplyr::select(ID, PC1, PC2, lfc, score)
# Plot change of number of genes
ggplotly(data.frame('fc'=fc_list, 'n_genes'=n_genes[-1]) %>% ggplot(aes(x=fc, y=n_genes)) +
geom_point() + geom_line() + theme_minimal() + xlab('Fold Change') +
ggtitle('Number of remaining genes when modifying filtering threshold'))
rm(fc_list, n_genes, fc, pca_samps_new, pca_genes_new, pca_samps_old, pca_genes_old,
datExpr_pca_samps, datExpr_pca_genes)
Note: PC values get smaller as Log2 fold change increases, so on each iteration the values were scaled so it would be easier to compare between frames
The lfc threshold doesn’t seem to make a big difference for differentiating genes by Diagnosis
ggplotly(pcas_samps %>% ggplot(aes(PC1, PC2, color=Diagnosis)) + geom_point(aes(frame=fc, ids=ID)) +
theme_minimal() + ggtitle('Samples PCA plot modifying filtering threshold'))
There doesn’t seem to be any strong recognisable pattern, perhaps the samples from the Occipital lobe have lower PC1 values than the ones from the Frontal lobe
ggplotly(pcas_samps %>% ggplot(aes(PC1, PC2, color=brain_lobe)) + geom_point(aes(frame=fc, ids=ID)) +
theme_minimal() + ggtitle('Samples PCA plot modifying filtering threshold'))
if(!'fcSign' %in% colnames(pcas_genes)){
pcas_genes = pcas_genes %>% left_join(DE_info, by='ID') %>% mutate(fcSign = ifelse(log2FoldChange>0,'Over-expressed','Under-expressed'))
}
ggplotly(pcas_genes %>% mutate(abs_lfc=ifelse(fc==-1,-1,round(log2(fc),3))) %>%
ggplot(aes(PC1, PC2, color=fcSign)) + geom_point(aes(frame=abs_lfc, ids=ID, alpha=0.1)) +
theme_minimal() + ggtitle('Genes PCA plot modifying |LFC| filtering threshold'))
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] knitr_1.28 ClusterR_1.2.1
## [3] gtools_3.8.2 Rtsne_0.15
## [5] DESeq2_1.24.0 SummarizedExperiment_1.14.1
## [7] DelayedArray_0.10.0 BiocParallel_1.18.1
## [9] matrixStats_0.56.0 Biobase_2.44.0
## [11] GenomicRanges_1.36.1 GenomeInfoDb_1.20.0
## [13] IRanges_2.18.3 S4Vectors_0.22.1
## [15] BiocGenerics_0.30.0 biomaRt_2.40.5
## [17] GGally_1.5.0 gridExtra_2.3
## [19] viridis_0.5.1 viridisLite_0.3.0
## [21] RColorBrewer_1.1-2 plotlyutils_0.0.0.9000
## [23] plotly_4.9.2 glue_1.3.2
## [25] reshape2_1.4.3 forcats_0.5.0
## [27] stringr_1.4.0 dplyr_0.8.5
## [29] purrr_0.3.3 readr_1.3.1
## [31] tidyr_1.0.2 tibble_3.0.0
## [33] ggplot2_3.3.0 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 ellipsis_0.3.0 htmlTable_1.13.3
## [4] XVector_0.24.0 base64enc_0.1-3 fs_1.4.0
## [7] rstudioapi_0.11 farver_2.0.3 bit64_0.9-7
## [10] AnnotationDbi_1.46.1 fansi_0.4.1 lubridate_1.7.4
## [13] xml2_1.2.5 splines_3.6.3 geneplotter_1.62.0
## [16] Formula_1.2-3 jsonlite_1.6.1 annotate_1.62.0
## [19] broom_0.5.5 cluster_2.1.0 dbplyr_1.4.2
## [22] png_0.1-7 shiny_1.4.0.2 compiler_3.6.3
## [25] httr_1.4.1 backports_1.1.5 fastmap_1.0.1
## [28] assertthat_0.2.1 Matrix_1.2-18 lazyeval_0.2.2
## [31] cli_2.0.2 later_1.0.0 acepack_1.4.1
## [34] htmltools_0.4.0 prettyunits_1.1.1 tools_3.6.3
## [37] gmp_0.5-13.6 gtable_0.3.0 GenomeInfoDbData_1.2.1
## [40] Rcpp_1.0.4 cellranger_1.1.0 vctrs_0.2.4
## [43] nlme_3.1-144 crosstalk_1.1.0.1 xfun_0.12
## [46] rvest_0.3.5 mime_0.9 miniUI_0.1.1.1
## [49] lifecycle_0.2.0 XML_3.99-0.3 zlibbioc_1.30.0
## [52] scales_1.1.0 promises_1.1.0 hms_0.5.3
## [55] curl_4.3 yaml_2.2.1 memoise_1.1.0
## [58] rpart_4.1-15 ggExtra_0.9 reshape_0.8.8
## [61] latticeExtra_0.6-29 stringi_1.4.6 RSQLite_2.2.0
## [64] highr_0.8 genefilter_1.66.0 checkmate_2.0.0
## [67] rlang_0.4.5 pkgconfig_2.0.3 bitops_1.0-6
## [70] evaluate_0.14 lattice_0.20-40 labeling_0.3
## [73] htmlwidgets_1.5.1 bit_1.1-15.2 tidyselect_1.0.0
## [76] plyr_1.8.6 magrittr_1.5 R6_2.4.1
## [79] generics_0.0.2 Hmisc_4.4-0 DBI_1.1.0
## [82] mgcv_1.8-31 pillar_1.4.3 haven_2.2.0
## [85] foreign_0.8-75 withr_2.1.2 survival_3.1-11
## [88] RCurl_1.98-1.1 nnet_7.3-13 modelr_0.1.6
## [91] crayon_1.3.4 rmarkdown_2.1 jpeg_0.1-8.1
## [94] progress_1.2.2 locfit_1.5-9.4 grid_3.6.3
## [97] readxl_1.3.1 data.table_1.12.8 blob_1.2.1
## [100] reprex_0.3.0 digest_0.6.25 xtable_1.8-4
## [103] httpuv_1.5.2 munsell_0.5.0